{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Conver the original data\n",
    "\n",
    "obtained from https://www.transtats.bts.gov/DL_SelectFields.asp\n",
    "\n",
    "on Sunday, 8-Sept-2019"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting the original airlines data to `hdf5`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T20:16:31.517614Z",
     "start_time": "2019-09-19T20:16:29.445382Z"
    }
   },
   "outputs": [],
   "source": [
    "import os \n",
    "import glob\n",
    "from zipfile import ZipFile\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import pandas as pd\n",
    "pd.set_option('display.max_columns', 500)\n",
    "\n",
    "import vaex\n",
    "\n",
    "from tqdm import  tqdm_notebook as tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T10:09:42.429415Z",
     "start_time": "2019-09-19T10:09:42.422859Z"
    }
   },
   "outputs": [],
   "source": [
    "# Columns to read\n",
    "columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', \n",
    "           'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number', \n",
    "           'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',\n",
    "           'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', \n",
    "           'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']\n",
    "\n",
    "# Force dtypes for these columns - some choices are made to optimize disk space\n",
    "dtypes = {\n",
    "'Year': np.int16,\n",
    "'Month': np.int8,\n",
    "'DayofMonth': np.int16,\n",
    "'DayOfWeek': np.int8,\n",
    "'DepTime': 'Int16',\n",
    "'CRSDepTime': 'Int16',\n",
    "'ArrTime': 'Int16',\n",
    "'CRSArrTime': 'Int16',\n",
    "'Reporting_Airline': np.object,\n",
    "'Flight_Number_Reporting_Airline': np.int32,\n",
    "'Tail_Number': np.object,\n",
    "'ActualElapsedTime': 'Int32',\n",
    "'CRSElapsedTime': 'Int32',\n",
    "'AirTime': 'Int32',\n",
    "'ArrDelay': 'Int32',\n",
    "'DepDelay': 'Int32',\n",
    "'Origin': np.object,\n",
    "'Dest': np.object,\n",
    "'Distance': 'Int32',\n",
    "'TaxiIn': 'Int32',\n",
    "'TaxiOut': 'Int32',\n",
    "'Cancelled': 'Int8',\n",
    "'CancellationCode': np.object,\n",
    "'Diverted': 'Int8',\n",
    "'CarrierDelay': 'Int32',\n",
    "'WeatherDelay': 'Int32',\n",
    "'NASDelay': 'Int32',\n",
    "'SecurityDelay': 'Int32',\n",
    "'LateAircraftDelay': 'Int32',\n",
    "}\n",
    "\n",
    "# Set up a renaming dictionary, in order to make the column names to match the well known data from \n",
    "# http://stat-computing.org/dataexpo/2009/the-data.html\n",
    "rename_dict = {\n",
    "    'DayofMonth': 'DayOfMonth',\n",
    "    'Reporting_Airline': 'UniqueCarrier',\n",
    "    'Flight_Number_Reporting_Airline': 'FlightNum',\n",
    "    'Tail_Number': 'TailNum'\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T10:09:44.179421Z",
     "start_time": "2019-09-19T10:09:44.172912Z"
    }
   },
   "outputs": [],
   "source": [
    "# Set up the list of zip files to be opened and converted\n",
    "zip_list = np.sort(np.array(glob.glob('./airlines-us-original/raw/*.zip')))[::-1]\n",
    "\n",
    "# The output directory\n",
    "output_dir = './airlines-us-original/hdf5/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T11:42:14.186704Z",
     "start_time": "2019-09-19T10:09:52.752002Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Converting to hdf5...', max=372, style=ProgressStyle(descript…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r"
     ]
    }
   ],
   "source": [
    "# The magic happens heree:\n",
    "for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):\n",
    "    # Setting up the files, and directories\n",
    "    zip_file = ZipFile(file)\n",
    "    output_file = file.split('/')[-1][:-3]+'hdf5'\n",
    "    output = output_dir + output_file\n",
    "    \n",
    "    # Check if a converted file already exists: if it does skip it, otherwise read in the raw csv and convert it\n",
    "    if (os.path.exists(output) and os.path.isfile(output)):\n",
    "        pass\n",
    "    else:\n",
    "        # Importing the data into pandas \n",
    "        pandas_df = [pd.read_csv(zip_file.open(text_file.filename), \n",
    "                                 encoding='latin',\n",
    "                                 usecols=columns,\n",
    "                                 dtype=dtypes,)\n",
    "                     for text_file in zip_file.infolist()\n",
    "                     if text_file.filename.endswith('.csv')][0]\n",
    "        # Rename some columns to match the more well known dataset from \n",
    "        # http://stat-computing.org/dataexpo/2009/the-data.html\n",
    "        pandas_df.rename(columns=rename_dict, inplace=True)\n",
    "\n",
    "        # Importing the data from pandas to vaex\n",
    "        vaex_df = vaex.from_pandas(pandas_df, copy_index=False)\n",
    "        \n",
    "        # Export the data with vaex to hdf5\n",
    "        vaex_df.export_hdf5(path=output, progress=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notes:\n",
    " - number of columns in the Data expo version: 29\n",
    "\n",
    "Here we go:\n",
    "1. Year -> Year\n",
    "- Month -> Month\n",
    "- DayOfMonth -> DayofMonth\n",
    "- DayOfWeek -> DayOfWeek\n",
    "- DepTime -> DepTime\n",
    "- CRSDepTime -> CRSDepTime\n",
    "- ArrTime -> ArrTime \n",
    "- CRSArrTime -> CRSArrTime \n",
    "- Reporting_Airline -> UniqueCarrier\n",
    "- Flight_Number_Reporting_Airline -> FlightNum\n",
    "- Tail_Number -> TailNum\n",
    "- ActualElapsedTime -> ActualElapsedTime\n",
    "- CRSElapsedTime -> CRSElapsedTime\n",
    "- AirTime -> AirTime\n",
    "- ArrDelay -> ArrDelay\n",
    "- DepDelay -> DepDelay\n",
    "- Origin -> Origin\n",
    "- Dest -> Dest\n",
    "- Distance -> Distance\n",
    "- TaxiIn -> TaxiIn\n",
    "- TaxiOut -> TaxiOut\n",
    "- Cancelled -> Cancelled\n",
    "- CancellationCode -> CancellationCode\n",
    "- Diverted -> Diverted\n",
    "- CarrierDelay -> CarrierDelay\n",
    "- WeatherDelay -> WeatherDelay \n",
    "- NASDelay -> NASDelay \n",
    "- SecurityDelay -> SecurityDelay\n",
    "- LateAircraftDelay -> LateAircraftDelay"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now merge all the little files together\n",
    "\n",
    "Note that macOS has a rather low limit on the number of files one can open at one time. To circumvent this issue, in a terminal run:\n",
    "\n",
    "`>ulimit -n 9999`\n",
    "\n",
    "It should be the same terminal from which later the jupyter server is started.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T11:52:13.445615Z",
     "start_time": "2019-09-19T11:52:11.931233Z"
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import glob\n",
    "import vaex\n",
    "import numpy as np\n",
    "\n",
    "def tryint(s):\n",
    "    try:\n",
    "        return int(s)\n",
    "    except:\n",
    "        return s\n",
    "\n",
    "def alphanum_key(s):\n",
    "    \"\"\" Turn a string into a list of string and number chunks.\n",
    "        \"z23a\" -> [\"z\", 23, \"a\"]\n",
    "    \"\"\"\n",
    "    return [ tryint(c) for c in re.split('([0-9]+)', s) ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T11:52:13.454973Z",
     "start_time": "2019-09-19T11:52:13.446990Z"
    }
   },
   "outputs": [],
   "source": [
    "hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')\n",
    "hdf5_list.sort(key=alphanum_key)\n",
    "hdf5_list = np.array(hdf5_list)\n",
    "\n",
    "assert len(hdf5_list) == 372, \"Incorrect number of files\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T12:07:34.862383Z",
     "start_time": "2019-09-19T11:58:04.846356Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[########################################]:  100.00% estimated time:        0s =  0.0m =  0.0h                                                                   "
     ]
    }
   ],
   "source": [
    "# This is an important step\n",
    "master_df = vaex.open_many(hdf5_list)\n",
    "\n",
    "# exporting\n",
    "master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-19T12:19:53.818836Z",
     "start_time": "2019-09-19T12:19:53.771996Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>#                                      </th><th>Year  </th><th>Month  </th><th>DayOfMonth  </th><th>DayOfWeek  </th><th>UniqueCarrier  </th><th>TailNum  </th><th>FlightNum  </th><th>Origin  </th><th>Dest  </th><th>CRSDepTime  </th><th>DepTime  </th><th>DepDelay  </th><th>TaxiOut  </th><th>TaxiIn  </th><th>CRSArrTime  </th><th>ArrTime  </th><th>ArrDelay  </th><th>Cancelled  </th><th>CancellationCode  </th><th>Diverted  </th><th>CRSElapsedTime  </th><th>ActualElapsedTime  </th><th>AirTime  </th><th>Distance  </th><th>CarrierDelay  </th><th>WeatherDelay  </th><th>NASDelay  </th><th>SecurityDelay  </th><th>LateAircraftDelay  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td><i style='opacity: 0.6'>0</i>          </td><td>1988  </td><td>1      </td><td>8           </td><td>5          </td><td>PI             </td><td>None     </td><td>930        </td><td>BGM     </td><td>ITH   </td><td>1525        </td><td>1532     </td><td>7         </td><td>--       </td><td>--      </td><td>1545        </td><td>1555     </td><td>10        </td><td>0          </td><td>None              </td><td>0         </td><td>20              </td><td>23                 </td><td>--       </td><td>32        </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>1</i>          </td><td>1988  </td><td>1      </td><td>9           </td><td>6          </td><td>PI             </td><td>None     </td><td>930        </td><td>BGM     </td><td>ITH   </td><td>1525        </td><td>1522     </td><td>-3        </td><td>--       </td><td>--      </td><td>1545        </td><td>1535     </td><td>-10       </td><td>0          </td><td>None              </td><td>0         </td><td>20              </td><td>13                 </td><td>--       </td><td>32        </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>2</i>          </td><td>1988  </td><td>1      </td><td>10          </td><td>7          </td><td>PI             </td><td>None     </td><td>930        </td><td>BGM     </td><td>ITH   </td><td>1525        </td><td>1522     </td><td>-3        </td><td>--       </td><td>--      </td><td>1545        </td><td>1534     </td><td>-11       </td><td>0          </td><td>None              </td><td>0         </td><td>20              </td><td>12                 </td><td>--       </td><td>32        </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>3</i>          </td><td>1988  </td><td>1      </td><td>11          </td><td>1          </td><td>PI             </td><td>None     </td><td>930        </td><td>BGM     </td><td>ITH   </td><td>1525        </td><td>--       </td><td>--        </td><td>--       </td><td>--      </td><td>1545        </td><td>--       </td><td>--        </td><td>1          </td><td>None              </td><td>0         </td><td>20              </td><td>--                 </td><td>--       </td><td>32        </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>4</i>          </td><td>1988  </td><td>1      </td><td>12          </td><td>2          </td><td>PI             </td><td>None     </td><td>930        </td><td>BGM     </td><td>ITH   </td><td>1525        </td><td>1524     </td><td>-1        </td><td>--       </td><td>--      </td><td>1545        </td><td>1540     </td><td>-5        </td><td>0          </td><td>None              </td><td>0         </td><td>20              </td><td>16                 </td><td>--       </td><td>32        </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td>...                                    </td><td>...   </td><td>...    </td><td>...         </td><td>...        </td><td>...            </td><td>...      </td><td>...        </td><td>...     </td><td>...   </td><td>...         </td><td>...      </td><td>...       </td><td>...      </td><td>...     </td><td>...         </td><td>...      </td><td>...       </td><td>...        </td><td>...               </td><td>...       </td><td>...             </td><td>...                </td><td>...      </td><td>...       </td><td>...           </td><td>...           </td><td>...       </td><td>...            </td><td>...                </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>183,821,921</i></td><td>2018  </td><td>12     </td><td>27          </td><td>4          </td><td>DL             </td><td>N705TW   </td><td>864        </td><td>JFK     </td><td>SLC   </td><td>935         </td><td>929      </td><td>-6        </td><td>28       </td><td>9       </td><td>1311        </td><td>1230     </td><td>-41       </td><td>0          </td><td>None              </td><td>0         </td><td>336             </td><td>301                </td><td>264      </td><td>1990      </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>183,821,922</i></td><td>2018  </td><td>12     </td><td>27          </td><td>4          </td><td>DL             </td><td>N336NB   </td><td>865        </td><td>MSP     </td><td>SLC   </td><td>1035        </td><td>1030     </td><td>-5        </td><td>14       </td><td>10      </td><td>1240        </td><td>1214     </td><td>-26       </td><td>0          </td><td>None              </td><td>0         </td><td>185             </td><td>164                </td><td>140      </td><td>991       </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>183,821,923</i></td><td>2018  </td><td>12     </td><td>27          </td><td>4          </td><td>DL             </td><td>N945DN   </td><td>866        </td><td>DEN     </td><td>MSP   </td><td>1054        </td><td>1100     </td><td>6         </td><td>10       </td><td>5       </td><td>1353        </td><td>1350     </td><td>-3        </td><td>0          </td><td>None              </td><td>0         </td><td>119             </td><td>110                </td><td>95       </td><td>680       </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>183,821,924</i></td><td>2018  </td><td>12     </td><td>27          </td><td>4          </td><td>DL             </td><td>N945DN   </td><td>866        </td><td>MSP     </td><td>DEN   </td><td>850         </td><td>850      </td><td>0         </td><td>15       </td><td>8       </td><td>1010        </td><td>951      </td><td>-19       </td><td>0          </td><td>None              </td><td>0         </td><td>140             </td><td>121                </td><td>98       </td><td>680       </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>183,821,925</i></td><td>2018  </td><td>12     </td><td>27          </td><td>4          </td><td>DL             </td><td>N901DE   </td><td>867        </td><td>ATL     </td><td>SYR   </td><td>1455        </td><td>1500     </td><td>5         </td><td>19       </td><td>3       </td><td>1703        </td><td>1703     </td><td>0         </td><td>0          </td><td>None              </td><td>0         </td><td>128             </td><td>123                </td><td>101      </td><td>794       </td><td>--            </td><td>--            </td><td>--        </td><td>--             </td><td>--                 </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "#            Year    Month    DayOfMonth    DayOfWeek    UniqueCarrier    TailNum    FlightNum    Origin    Dest    CRSDepTime    DepTime    DepDelay    TaxiOut    TaxiIn    CRSArrTime    ArrTime    ArrDelay    Cancelled    CancellationCode    Diverted    CRSElapsedTime    ActualElapsedTime    AirTime    Distance    CarrierDelay    WeatherDelay    NASDelay    SecurityDelay    LateAircraftDelay\n",
       "0            1988    1        8             5            PI               None       930          BGM       ITH     1525          1532       7           --         --        1545          1555       10          0            None                0           20                23                   --         32          --              --              --          --               --\n",
       "1            1988    1        9             6            PI               None       930          BGM       ITH     1525          1522       -3          --         --        1545          1535       -10         0            None                0           20                13                   --         32          --              --              --          --               --\n",
       "2            1988    1        10            7            PI               None       930          BGM       ITH     1525          1522       -3          --         --        1545          1534       -11         0            None                0           20                12                   --         32          --              --              --          --               --\n",
       "3            1988    1        11            1            PI               None       930          BGM       ITH     1525          --         --          --         --        1545          --         --          1            None                0           20                --                   --         32          --              --              --          --               --\n",
       "4            1988    1        12            2            PI               None       930          BGM       ITH     1525          1524       -1          --         --        1545          1540       -5          0            None                0           20                16                   --         32          --              --              --          --               --\n",
       "...          ...     ...      ...           ...          ...              ...        ...          ...       ...     ...           ...        ...         ...        ...       ...           ...        ...         ...          ...                 ...         ...               ...                  ...        ...         ...             ...             ...         ...              ...\n",
       "183,821,921  2018    12       27            4            DL               N705TW     864          JFK       SLC     935           929        -6          28         9         1311          1230       -41         0            None                0           336               301                  264        1990        --              --              --          --               --\n",
       "183,821,922  2018    12       27            4            DL               N336NB     865          MSP       SLC     1035          1030       -5          14         10        1240          1214       -26         0            None                0           185               164                  140        991         --              --              --          --               --\n",
       "183,821,923  2018    12       27            4            DL               N945DN     866          DEN       MSP     1054          1100       6           10         5         1353          1350       -3          0            None                0           119               110                  95         680         --              --              --          --               --\n",
       "183,821,924  2018    12       27            4            DL               N945DN     866          MSP       DEN     850           850        0           15         8         1010          951        -19         0            None                0           140               121                  98         680         --              --              --          --               --\n",
       "183,821,925  2018    12       27            4            DL               N901DE     867          ATL       SYR     1455          1500       5           19         3         1703          1703       0           0            None                0           128               123                  101        794         --              --              --          --               --"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check how the single file looks like:\n",
    "df = vaex.open('./airline_data_1988_2018.hd5')\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
